
# Script preamble: announce the script revision, load the robust-covariance
# package, and parse the two required command-line arguments
# (split number, repository root directory).
print("version 3")
library(sandwich)

# warn = 1: print warnings as they occur instead of deferring them.
options(warn = 1)
args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 2) {
  stop("Missing arguments: Provide split number (0 or 1) and path to root directory")
}

# as.numeric() returns NA (with a warning) for non-numeric text.
trainingIndicator <- as.numeric(args[1])
repoRootDir <- as.character(args[2])

# NA must be tested explicitly: `NA == 0 || NA == 1` is NA, and `if (NA)`
# aborts with "missing value where TRUE/FALSE needed" rather than reaching
# the intended error message below.
if (is.na(trainingIndicator) || !(trainingIndicator %in% c(0, 1))) {
  print(trainingIndicator)
  stop("Invalid argument values")
}

# One output file per split.
if (trainingIndicator == 1) {
  outputSaveFilepath <- file.path(repoRootDir, "output", "split1.csv")
} else {
  outputSaveFilepath <- file.path(repoRootDir, "output", "split0.csv")
}

# Directory holding the input data files.
kINPUTDATADIR <- file.path(repoRootDir, "data/datasets")

# Lambda used by calcSummaryOutputValues (read there as `lb`).
lambdaValueGLOBAL <- 1

set.seed(7)


# Compute all summary statistics for one 2x2 dataset and return them as one
# comma-separated string (a single row of the output table).
#
# Args:
#   x:            numeric vector, one value per unit (precinct); the fraction
#                 of the unit belonging to the group of interest.
#   t:            numeric vector, the observed outcome fraction per unit.
#   n:            numeric vector, the population of each unit (used as weight).
#   trueBetaB:    numeric vector, the true within-group rate per unit.
#   trueBetaW:    numeric vector with the true rate of the complementary
#                 group, or FALSE / NULL when unknown, in which case it is
#                 derived from the identity t = x * betaB + (1 - x) * betaW.
#   datasetLabel: character label appended as the last field of the row.
#
# Returns:
#   character(1): 34 numeric fields formatted with "%f" followed by the
#   label, in the order
#   bd, hbdl0, hbdu0, cil, cir, bdl, bdu, t3, ss.y, ss.reslo, ss.res2, p,
#   w0, c1, d1, l, u, w30, w31, w32, w33,
#   ss.res.loessWithTrueBetaB, ss.res.regBetaBVsX, regBetaBVsXvcov.tstat,
#   regBetaBVsX.lmsummary.tstat, ss.res.loessWithTrueBetaW,
#   ss.res.regBetaWVsX, regBetaWVsXvcov.tstat, regBetaWVsX.lmsummary.tstat,
#   tb2, tw2, tb1, tw1, corBetaBandBetaW, datasetLabel.
#
# Depends on the global `lambdaValueGLOBAL` and on sandwich::vcovHC.
# Quantities that are not part of the output row are not computed here.
calcSummaryOutputValues <- function(x, t, n, trueBetaB, trueBetaW=FALSE, datasetLabel="NO_DATASET_LABEL_PROVIDED") {
  # NOTE: the parameter `t` is a data vector; calls written as t(...) below
  # still resolve to the matrix transpose because R skips non-function
  # bindings when looking up a function name.

  # HC1-robust t-statistic of the k-th coefficient of a fitted lm.
  robustTStat <- function(fit, k) {
    coef(fit)[k] / sqrt(vcovHC(fit, type = "HC1")[k, k])
  }
  # Residual sum of squares of a fitted model.
  residualSS <- function(fit) {
    sum(resid(fit)^2)
  }

  p <- length(x)          # number of precincts
  l <- min(x) + 0.00001   # lower end of x range where the linear contextual model is assumed
  u <- max(x) - 0.00001   # upper end of x range where the linear contextual model is assumed

  groupCount <- n * x
  groupTotal <- sum(groupCount)

  # True district-level rate, weighted by rounded group counts.
  bd <- sum(round(groupCount) * trueBetaB) / sum(round(groupCount))

  ##############################################################################
  # Proposed district-level bounds
  ##############################################################################
  r <- sum(groupCount * (1 - x)) / groupTotal
  lb <- lambdaValueGLOBAL  # this is our choice of lambda
  h0 <- lb * sum(groupCount * t) / groupTotal
  s1 <- sqrt(sum((groupCount * ((1 + lb) / 2 - lb * x))^2) / groupTotal^2)
  ex <- 1                  # multiplier of the standard error; the CI is the conservative one at ex = 1
  h <- c(1 - lb,
         sum(groupCount * (1 - lb * x)) / groupTotal,
         sum(n * x^2 * (1 - lb * x)) / groupTotal)

  # Four linear constraints, evaluated at x = l (first two) and x = u (last
  # two). The coefficient vectors are shared by the lower- and upper-bound
  # computations; only the constant offsets differ.
  constraintCoefs <- list(
    c(-1 / l, 0, 0),
    c(1 / (1 - l), 1 / (1 - l), 1 / (1 - l) - 1),
    c(-1 / u, 0, 0),
    c(1 / (1 - u), 1 / (1 - u), 1 / (1 - u) - 1)
  )
  offsetsForLowerBound <- c(1 / l, 0, 1 / u, 0)
  offsetsForUpperBound <- c(0, -1 / (1 - l), 0, -1 / (1 - u))

  # Duncan-Davis bounds at precinct level; non-finite values (e.g. from
  # x == 0) fall back to the trivial bounds 0 and 1.
  bl <- pmax(0, (t + x - 1) / x)
  bl[!is.finite(bl)] <- 0.0
  bu <- pmin(1, t / x)
  bu[!is.finite(bu)] <- 1.0

  # Quadratic regression t ~ w0 + c1 * x + d1 * x^2.
  reg <- lm(formula = t ~ poly(x, 2, raw = TRUE))
  w0 <- coef(reg)[1]
  c1 <- coef(reg)[2]
  d1 <- coef(reg)[3]
  th <- c(w0, c1, d1)

  # HC1 sandwich variance (matches Stata's robust option), see
  # https://stats.stackexchange.com/questions/117052/replicating-statas-robust-option-in-r
  # https://www.stata.com/manuals/p_robust.pdf
  v <- vcovHC(reg, type = "HC1")

  nConstraints <- length(constraintCoefs)
  bbl <- numeric(nConstraints)      # candidate lower bounds
  bbu <- numeric(nConstraints)      # candidate upper bounds
  boundSE <- numeric(nConstraints)  # standard error attached to each candidate
  for (k in seq_len(nConstraints)) {
    direction <- h - r * constraintCoefs[[k]]
    linearPart <- drop(t(direction) %*% th)
    boundSE[k] <- s1 + sqrt(drop(t(direction) %*% v %*% direction))
    bbl[k] <- h0 - r * offsetsForLowerBound[k] + linearPart
    bbu[k] <- h0 - r * offsetsForUpperBound[k] + linearPart
  }

  # Conservative confidence interval: tightest bound widened by ex standard
  # errors of the constraint that attains it.
  cil <- max(bbl) - ex * boundSE[which.max(bbl)]  # lower end of CI_ex
  cir <- min(bbu) + ex * boundSE[which.min(bbu)]  # upper end of CI_ex

  hbdu0 <- min(bbu)  # proposed district-level upper bound (weighted)
  hbdl0 <- max(bbl)  # proposed district-level lower bound (weighted)

  bdu <- sum(groupCount * bu) / groupTotal  # district-level upper bound by Duncan-Davis (weighted)
  bdl <- sum(groupCount * bl) / groupTotal  # district-level lower bound by Duncan-Davis (weighted)

  ##############################################################################
  # Diagnostics of t versus x
  ##############################################################################
  reg3 <- lm(formula = t ~ poly(x, 3, raw = TRUE))
  w30 <- coef(reg3)[1]
  w31 <- coef(reg3)[2]
  w32 <- coef(reg3)[3]
  w33 <- coef(reg3)[4]
  t3 <- robustTStat(reg3, 4)  # robust t-statistic of the cubic term

  lo <- loess(t ~ x, span = 1)
  ss.y <- sum(scale(t, scale = FALSE)^2)  # total sum of squares of t
  ss.reslo <- residualSS(lo)
  ss.res2 <- residualSS(reg)

  ##############################################################################
  # Diagnostics of the true rates versus x
  ##############################################################################
  # beta B
  ss.res.loessWithTrueBetaB <- residualSS(loess(trueBetaB ~ x, span = 1))
  regBetaBVsX <- lm(trueBetaB ~ x)
  ss.res.regBetaBVsX <- residualSS(regBetaBVsX)
  regBetaBVsXvcov.tstat <- robustTStat(regBetaBVsX, 2)
  regBetaBVsX.lmsummary.tstat <- summary(regBetaBVsX)$coefficients[2, "t value"]

  # beta W: FALSE (the default) or an empty value means "not provided".
  # identical() is used so that a genuine numeric 0 is not mistaken for the
  # sentinel.
  if (length(trueBetaW) == 0 || identical(trueBetaW, FALSE)) {
    print("trueBetaW not provided")
    trueBetaW <- (t - trueBetaB * x) / (1 - x)
  } else if (length(trueBetaW) != p) {
    stop("trueBetaW must be FALSE or contain one value per element of x")
  }
  ss.res.loessWithTrueBetaW <- residualSS(loess(trueBetaW ~ x, span = 1))
  regBetaWVsX <- lm(trueBetaW ~ x)
  ss.res.regBetaWVsX <- residualSS(regBetaWVsX)
  regBetaWVsXvcov.tstat <- robustTStat(regBetaWVsX, 2)
  regBetaWVsX.lmsummary.tstat <- summary(regBetaWVsX)$coefficients[2, "t value"]

  # Quadratic and linear regressions of b and w versus x.
  tb2 <- robustTStat(lm(formula = trueBetaB ~ poly(x, 2, raw = TRUE)), 3)  # quadratic effect of b vs x
  tw2 <- robustTStat(lm(formula = trueBetaW ~ poly(x, 2, raw = TRUE)), 3)  # quadratic effect of w vs x
  tb1 <- robustTStat(lm(formula = trueBetaB ~ poly(x, 1, raw = TRUE)), 2)  # linear effect of b vs x
  tw1 <- robustTStat(lm(formula = trueBetaW ~ poly(x, 1, raw = TRUE)), 2)  # linear effect of w vs x

  corBetaBandBetaW <- cor(trueBetaB, trueBetaW)

  outputString <- sprintf(
    "%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %s",
    bd, hbdl0, hbdu0, cil, cir, bdl, bdu, t3, ss.y, ss.reslo, ss.res2, p,
    w0, c1, d1, l, u, w30, w31, w32, w33,
    ss.res.loessWithTrueBetaB, ss.res.regBetaBVsX, regBetaBVsXvcov.tstat, regBetaBVsX.lmsummary.tstat,
    ss.res.loessWithTrueBetaW, ss.res.regBetaWVsX, regBetaWVsXvcov.tstat, regBetaWVsX.lmsummary.tstat,
    tb2, tw2, tb1, tw1, corBetaBandBetaW, datasetLabel
  )
  return(outputString)
}


# Compute the summary row for one dataset, store it in the next free slot of
# the global `datasetSummaryDataframe`, and advance the global row counter
# `dataSetCtr`. Arguments are forwarded unchanged to calcSummaryOutputValues.
updateDataframeAndCtr <- function(x, t, n, trueBetaB, trueBetaW=FALSE, datasetLabel="NO_DATASET_LABEL_PROVIDED") {
  summaryRow <- calcSummaryOutputValues(x, t, n, trueBetaB, trueBetaW, datasetLabel)
  # Both assignments target the script-level variables, hence `<<-`.
  datasetSummaryDataframe$summary[dataSetCtr] <<- summaryRow
  dataSetCtr <<- dataSetCtr + 1
}


# Read one CSV file per year and append two summary rows per file (the group
# of interest and its complement) to the global summary table.
#
# The whole call is a no-op unless the global `datasetInTrainingGLOBAL`
# equals the global `trainingIndicator` (the split requested on the command
# line).
#
# Args:
#   dataDir:      directory containing the CSV files.
#   dataFiles:    character vector; only the first element is used, as a
#                 sprintf template taking the year.
#   yearLabels:   years to process, in order.
#   datasetLabel: sprintf template taking the year (%d) and a group label (%s).
#   blabel:       label of the group described by columns X / W1.
#   wlabel:       label of the complementary group (1 - X / W2).
#
# Each file must provide the columns X, T, N, W1 and W2 and satisfy
# T = X * W1 + (1 - X) * W2 up to a tolerance of 1e-5; otherwise the function
# stops with a message naming the offending file.
updateWithIPUMSData <- function(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel) {
  if (datasetInTrainingGLOBAL != trainingIndicator) { # here, 1 indicates training data
    return(invisible(NULL))
  }

  for (oneYearLabel in yearLabels) {
    print(sprintf("Currently processing %s year %d", datasetLabel, oneYearLabel))
    datasetNameLabel <- sprintf(dataFiles[1], oneYearLabel)
    dataFile <- file.path(dataDir, datasetNameLabel)

    inputDataSet <- read.table(dataFile, header = TRUE, sep = ",")

    #[1] "X"    "notX" "T"    "notT" "W1"   "W2"   "N"

    x <- inputDataSet$X
    t <- inputDataSet$T
    n <- inputDataSet$N

    trueBetaB <- inputDataSet$W1
    trueBetaW <- inputDataSet$W2

    # check data consistency: all.equal() returns TRUE or a character
    # description of the mismatch, which is included in the error so the
    # failing file can be identified.
    checkT <- x * trueBetaB + (1 - x) * trueBetaW
    equalCheck <- all.equal(t, checkT, tolerance = 0.00001)
    if (!isTRUE(equalCheck)) {
      stop(sprintf("Data consistency check failed for %s: %s",
                   dataFile, paste(equalCheck, collapse = "; ")))
    }

    updateDataframeAndCtr(x, t, n, trueBetaB, trueBetaW, sprintf(datasetLabel, oneYearLabel, blabel))
    updateDataframeAndCtr(1 - x, t, n, trueBetaW, trueBetaB, sprintf(datasetLabel, oneYearLabel, wlabel))
  }
  invisible(NULL)
}

# Solve t = x * betaB + (1 - x) * betaW for betaW, element-wise.
# `n` is accepted for signature symmetry with the other helpers but unused.
calculateBetaWFromBetaB <- function(x, t, n, trueBetaB) {
  complementShare <- 1 - x
  complementOutcome <- t - trueBetaB * x
  complementOutcome / complementShare
}

# Append two summary rows for one 2x2 view of the 'senc' data: first the
# group itself (the "%s" in datasetLabel is filled with ""), then its
# complement (filled with "not"). The complement's true rate is derived from
# x, t and trueBetaB.
updateWithSencData <- function(x, t, n, trueBetaB, datasetLabel) {
  groupLabel <- sprintf(datasetLabel, "")
  # FALSE: let calcSummaryOutputValues derive the complementary rate itself.
  updateDataframeAndCtr(x, t, n, trueBetaB, FALSE, groupLabel)

  complementLabel <- sprintf(datasetLabel, "not")
  complementBeta <- calculateBetaWFromBetaB(x, t, n, trueBetaB)
  updateDataframeAndCtr(1-x, t, n, complementBeta, trueBetaB, complementLabel)
}

# setup dataframe:
dataSetCtr <- 1 # header is at index 1
numDataSets <- 10000 # this is reduced below
datasetSummaryDataframe <- data.frame(c(numeric(numDataSets)))
colnames(datasetSummaryDataframe)[dataSetCtr] <- "summary"
datasetSummaryDataframe$summary[dataSetCtr] <- sprintf("unique_id, bd, hbdl0, hbdu0, cil, cir, bdl, bdu, t3, ss.y, ss.reslo, ss.res2, p, w0, c1, d1, l, u, w30, w31, w32, w33, ss.res.loessWithTrueBetaB, ss.res.regBetaBVsX, regBetaBVsXvcov.tstat, regBetaBVsX.lmsummary.tstat, ss.res.loessWithTrueBetaW, ss.res.regBetaWVsX, regBetaWVsXvcov.tstat, regBetaWVsX.lmsummary.tstat, tb2, tw2, tb1, tw1, corTruebetaBandTruebetaW, dataset_label")

dataSetCtr <- dataSetCtr + 1


################## Non-IPUMS data:
# Non-IPUMS datasets. They are processed only for split 1. Every dataset
# contributes two rows: the group described by x / trueBetaB and the
# complementary group described by 1 - x / trueBetaW. All rows are appended
# through updateDataframeAndCtr so the row counter is maintained in one place.
if (trainingIndicator == 1) {
  ##############################################################################
  # census data (dataset `census` from package eco)
  ##########################

  library("eco")
  data("census")
  inputDataSet <- census

  x <- inputDataSet$X
  t <- inputDataSet$Y
  n <- inputDataSet$N

  trueBetaB <- inputDataSet$W1
  trueBetaW <- inputDataSet$W2

  datasetLabel <- "Census data (Black Literacy Rates in 1910 Census): Males"
  updateDataframeAndCtr(x, t, n, trueBetaB, trueBetaW, datasetLabel)
  datasetLabel <- "Census data (Black Literacy Rates in 1910 Census): Females"
  updateDataframeAndCtr(1-x, t, n, trueBetaW, trueBetaB, datasetLabel)


  ##############################################################################
  # matproii data (dataset `matproii` from package ei)
  ##########################
  library(ei)
  data(matproii)
  inputDataSet <- matproii

  x <- inputDataSet$x
  t <- inputDataSet$t
  n <- inputDataSet$n

  trueBetaB <- inputDataSet$tb
  trueBetaW <- inputDataSet$tw

  datasetLabel <- "Voter Registration by Race in Southern States (matproii): Blacks"
  updateDataframeAndCtr(x, t, n, trueBetaB, trueBetaW, datasetLabel)
  datasetLabel <- "Voter Registration by Race in Southern States (matproii): Whites"
  updateDataframeAndCtr(1-x, t, n, trueBetaW, trueBetaB, datasetLabel)


  ##############################################################################
  # senc data (dataset `senc` from package eiPack)
  ##########################
  # eco and ei are already attached above.
  library(eiPack)
  data(senc)

  # True within-group rates. The small constant added to each denominator
  # avoids division by zero for units where the group count is 0.
  trueBetaWhiteDem <- senc$whdem / (senc$white+0.0000001) #senc$total
  trueBetaWhiteRep <- senc$whrep / (senc$white+0.0000001)
  trueBetaWhiteNon <- senc$whnon / (senc$white+0.0000001)

  trueBetaBlackDem <- senc$bldem / (senc$black+0.0000001)
  trueBetaBlackRep <- senc$blrep / (senc$black+0.0000001)
  trueBetaBlackNon <- senc$blnon / (senc$black+0.0000001)

  trueBetaNatamDem <- senc$natamdem / (senc$natam+0.0000001)
  trueBetaNatamRep <- senc$natamrep / (senc$natam+0.0000001)
  trueBetaNatamNon <- senc$natamnon / (senc$natam+0.0000001)

  # Column (outcome) and row (group) fractions of each unit.
  n <- senc$total
  inputT1 <- senc$dem / senc$total
  inputT2 <- senc$rep / senc$total
  inputT3 <- senc$non / senc$total
  inputX1 <- senc$white / senc$total
  inputX2 <- senc$black / senc$total
  inputX3 <- senc$natam / senc$total

  # Each call adds the row for the group and the row for "not" the group;
  # the "%s" in the label is filled in by updateWithSencData.
  datasetLabel <- "Treating the ‘senc’ data as 2x2 datasets: 'senc' data: Row1 %s Column1 (White-Dem)"
  updateWithSencData(inputX1, inputT1, n, trueBetaWhiteDem, datasetLabel)
  datasetLabel <- "Treating the ‘senc’ data as 2x2 datasets: 'senc' data: Row1 %s Column1 (White-Rep)"
  updateWithSencData(inputX1, inputT2, n, trueBetaWhiteRep, datasetLabel)
  datasetLabel <- "Treating the ‘senc’ data as 2x2 datasets: 'senc' data: Row1 %s Column1 (White-Non)"
  updateWithSencData(inputX1, inputT3, n, trueBetaWhiteNon, datasetLabel)

  datasetLabel <- "Treating the ‘senc’ data as 2x2 datasets: 'senc' data: Row1 %s Column1 (Black-Dem)"
  updateWithSencData(inputX2, inputT1, n, trueBetaBlackDem, datasetLabel)
  datasetLabel <- "Treating the ‘senc’ data as 2x2 datasets: 'senc' data: Row1 %s Column1 (Black-Rep)"
  updateWithSencData(inputX2, inputT2, n, trueBetaBlackRep, datasetLabel)
  datasetLabel <- "Treating the ‘senc’ data as 2x2 datasets: 'senc' data: Row1 %s Column1 (Black-Non)"
  updateWithSencData(inputX2, inputT3, n, trueBetaBlackNon, datasetLabel)

  datasetLabel <- "Treating the ‘senc’ data as 2x2 datasets: 'senc' data: Row1 %s Column1 (Natam-Dem)"
  updateWithSencData(inputX3, inputT1, n, trueBetaNatamDem, datasetLabel)
  datasetLabel <- "Treating the ‘senc’ data as 2x2 datasets: 'senc' data: Row1 %s Column1 (Natam-Rep)"
  updateWithSencData(inputX3, inputT2, n, trueBetaNatamRep, datasetLabel)
  datasetLabel <- "Treating the ‘senc’ data as 2x2 datasets: 'senc' data: Row1 %s Column1 (Natam-Non)"
  updateWithSencData(inputX3, inputT3, n, trueBetaNatamNon, datasetLabel)



  # # dropping precincts with 10 or fewer people in the population of interest:
  # senc$natam < 10
  # inputX3[senc$natam > 10]
  # populationMask <- senc$natam > 10
  # datasetLabel <- "Treating the ‘senc’ data as 2x2 datasets: 'senc' data: Row1 %s Column1 (Natam-Dem)--dropping precincts with 10 or fewer people in the population of interest"
  # updateWithSencData(inputX3[populationMask], inputT1[populationMask], n[populationMask], trueBetaNatamDem[populationMask], datasetLabel)
  # datasetLabel <- "Treating the ‘senc’ data as 2x2 datasets: 'senc' data: Row1 %s Column1 (Natam-Rep)--dropping precincts with 10 or fewer people in the population of interest"
  # updateWithSencData(inputX3[populationMask], inputT2[populationMask], n[populationMask], trueBetaNatamRep[populationMask], datasetLabel)
  # datasetLabel <- "Treating the ‘senc’ data as 2x2 datasets: 'senc' data: Row1 %s Column1 (Natam-Non)--dropping precincts with 10 or fewer people in the population of interest"
  # updateWithSencData(inputX3[populationMask], inputT3[populationMask], n[populationMask], trueBetaNatamNon[populationMask], datasetLabel)


  # ##############################################################################
  # CDC data 1 (by gender)
  # ##########################
  dataFile <- file.path(kINPUTDATADIR,"UnderlyingCauseofDeath1999-2015_bygender_black_white_agelessthan14.formatted.txt.xtnformatted.txt")
  cdcData <- read.table(dataFile, header = TRUE, sep = ",")

  #[1] "X"    "notX" "T"    "notT" "W1"   "W2"   "N"

  inputDataSet <- cdcData

  x <- inputDataSet$X
  t <- inputDataSet$T
  n <- inputDataSet$N

  trueBetaB <- inputDataSet$W1
  trueBetaW <- inputDataSet$W2

  datasetLabel <- "CDC data 1: Deaths from 1999 to 2015 among blacks and whites aged 14 and under by GENDER: Males"
  updateDataframeAndCtr(x, t, n, trueBetaB, trueBetaW, datasetLabel)
  datasetLabel <- "CDC data 1: Deaths from 1999 to 2015 among blacks and whites aged 14 and under by GENDER: Females"
  updateDataframeAndCtr(1-x, t, n, trueBetaW, trueBetaB, datasetLabel)

  # ##############################################################################
  # CDC data 2 (by race)
  # ##########################
  dataFile <- file.path(kINPUTDATADIR,"UnderlyingCauseofDeath1999-2015_byrace_black_white_agelessthan14.formatted.txt.xtnformatted.txt")
  cdcData <- read.table(dataFile, header = TRUE, sep = ",")

  #[1] "X"    "notX" "T"    "notT" "W1"   "W2"   "N"

  inputDataSet <- cdcData

  x <- inputDataSet$X
  t <- inputDataSet$T
  n <- inputDataSet$N

  trueBetaB <- inputDataSet$W1
  trueBetaW <- inputDataSet$W2

  datasetLabel <- "CDC data 2: Deaths from 1999 to 2015 among blacks and whites aged 14 and under by RACE: Blacks"
  updateDataframeAndCtr(x, t, n, trueBetaB, trueBetaW, datasetLabel)
  datasetLabel <- "CDC data 2: Deaths from 1999 to 2015 among blacks and whites aged 14 and under by RACE: Whites"
  updateDataframeAndCtr(1-x, t, n, trueBetaW, trueBetaB, datasetLabel)


  # ##############################################################################
  # India
  # ##########################
  dataDir <- kINPUTDATADIR
  dataFiles <- c("india_bygender_edu_attendance_Total.csv", "india_bygender_edu_attendance_Rural.csv", "india_bygender_edu_attendance_Urban.csv", "india_bygender_edu_literacy_Total.csv", "india_bygender_edu_literacy_Rural.csv", "india_bygender_edu_literacy_Urban.csv")
  dataFilesLabels <- c("Attendance of Educational Institutions 2001 by gender (for RURAL and URBAN areas within districts of India)", 
                       "Attendance of Educational Institutions 2001 by GENDER (for RURAL areas within districts of India)", 
                       "Attendance of Educational Institutions 2001 by GENDER (for URBAN areas within districts of India)", 
                       "Literacy (among those attending educational institutions) 2001 by GENDER (for RURAL and URBAN areas within districts of India)", 
                       "Literacy (among those attending educational institutions) 2001 by GENDER (for RURAL areas within districts of India)", 
                       "Literacy (among those attending educational institutions) 2001 by GENDER (for URBAN areas within districts of India)")
  # Files and labels are paired by position.
  stopifnot(length(dataFiles) == length(dataFilesLabels))

  for (datasetIntLabel in seq_along(dataFiles)) {
    dataFile <- file.path(dataDir, dataFiles[datasetIntLabel])

    inputDataSet <- read.table(dataFile, header = TRUE, sep = ",")

    #[1] "X"    "notX" "T"    "notT" "W1"   "W2"   "N"

    x <- inputDataSet$X
    t <- inputDataSet$T
    n <- inputDataSet$N

    trueBetaB <- inputDataSet$W1
    trueBetaW <- inputDataSet$W2

    datasetLabel <- sprintf("India data %d: %s: B", datasetIntLabel, dataFilesLabels[datasetIntLabel])
    updateDataframeAndCtr(x, t, n, trueBetaB, trueBetaW, datasetLabel)
    datasetLabel <- sprintf("India data %d: %s: W", datasetIntLabel, dataFilesLabels[datasetIntLabel])
    updateDataframeAndCtr(1-x, t, n, trueBetaW, trueBetaB, datasetLabel)
  }
}


# ##############################################################################
# IPUMS -- MERGED: Full datasets and Census samples and ACS samples
# ##########################

##################      literacy by GENDER

# Literacy by gender (files named x-is-men, t-is-LIT). These datasets are
# processed only when the script runs with split argument 0, because
# updateWithIPUMSData compares datasetInTrainingGLOBAL with trainingIndicator.
datasetInTrainingGLOBAL <- 0
dataDir <- kINPUTDATADIR
blabel <- "Men"
wlabel <- "Women"

# Census samples by county.
yearLabels <- c(1850, 1860, 1870, 1880, 1900, 1910, 1920, 1930)
dataFiles <- "ipums_sample_census_year%d_menwomen_literate_notliterate__x-is-men_t-is-LIT.csv"
datasetLabel <- "IPUMS census (by county) data: Literacy by GENDER: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# Census samples by MCD (same years as above).
dataFiles <- "ipums_sample_census_by_mcd_year%d_menwomen_literate_notliterate__x-is-men_t-is-LIT.csv"
datasetLabel <- "IPUMS census (by MCD) data: Literacy by GENDER: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# Full census by county.
yearLabels <- c(1850, 1900, 1910, 1920, 1930)
dataFiles <- "ipums_full_census_year%d_menwomen_literate_notliterate__x-is-men_t-is-LIT.csv"
datasetLabel <- "IPUMS full census (by county) data: Literacy by GENDER: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)


##################      literacy by race

# Literacy by race (files named x-is-black, t-is-LIT). Processed only when
# the script runs with split argument 0.
datasetInTrainingGLOBAL <- 0
dataDir <- kINPUTDATADIR
blabel <- "Blacks"
wlabel <- "Whites"

# Census samples by county.
yearLabels <- c(1850, 1860, 1870, 1880, 1900, 1910, 1920, 1930)
dataFiles <- "ipums_sample_census_year%d_blackwhite_literate_notliterate__x-is-black_t-is-LIT.csv"
datasetLabel <- "IPUMS census (by county) data: Literacy by RACE: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# Census samples by MCD (same years as above).
dataFiles <- "ipums_sample_census_by_mcd_year%d_blackwhite_literate_notliterate__x-is-black_t-is-LIT.csv"
datasetLabel <- "IPUMS census (by MCD) data: Literacy by RACE: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# Full census by county.
yearLabels <- c(1850, 1900, 1910, 1920, 1930)
dataFiles <- "ipums_full_census_year%d_blackwhite_literate_notliterate__x-is-black_t-is-LIT.csv"
datasetLabel <- "IPUMS full census (by county) data: Literacy by RACE: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)


##################      LABFORCE by RACE

# Labor-force status by race (files named x-is-black, t-is-LABFORCE).
# Processed only when the script runs with split argument 1.
datasetInTrainingGLOBAL <- 1
dataDir <- kINPUTDATADIR
blabel <- "Blacks"
wlabel <- "Whites"

# Census samples by county.
dataFiles <- "ipums_sample_census_year%d_blackwhite_notinlaborforce_inlaborforce__x-is-black_t-is-LABFORCE.csv"
yearLabels <- c(1850, 1860, 1870, 1880, 1900, 1910, 1920, 1930, 1940, 1950, 1960, 1980, 1990, 2000)
datasetLabel <- "IPUMS census (by county) data: Work Force Status by RACE: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# Census samples by MCD.
dataFiles <- "ipums_sample_census_by_mcd_year%d_blackwhite_notinlaborforce_inlaborforce__x-is-black_t-is-LABFORCE.csv"
yearLabels <- c(1850, 1860, 1870, 1880, 1900, 1910, 1920, 1930)
datasetLabel <- "IPUMS census (by MCD) data: Work Force Status by RACE: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# ACS samples by county.
dataFiles <- "ipums_acs_census_year%d_blackwhite_notinlaborforce_inlaborforce__x-is-black_t-is-LABFORCE.csv"
yearLabels <- c(2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016)
datasetLabel <- "IPUMS ACS (by county) data: Work Force Status by RACE: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# Full census by county.
dataFiles <- "ipums_full_census_year%d_blackwhite_notinlaborforce_inlaborforce__x-is-black_t-is-LABFORCE.csv"
yearLabels <- c(1850, 1880, 1910, 1920, 1930, 1940)
datasetLabel <- "IPUMS full census (by county) data: Work Force Status by RACE: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# Full census by MCD (1880 only).
dataFiles <- "ipums_full_census_by_mcd_year%d_blackwhite_notinlaborforce_inlaborforce__x-is-black_t-is-LABFORCE.csv"
yearLabels <- 1880
datasetLabel <- "IPUMS full census (by MCD) data: Work Force Status by RACE: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)



##################      LABFORCE GENDER

# not using 1850, because no women
# Labor-force status by gender (files named x-is-men, t-is-LABFORCE).
# Processed only when the script runs with split argument 0.
datasetInTrainingGLOBAL <- 0
dataDir <- kINPUTDATADIR
blabel <- "Men"
wlabel <- "Women"

# Census samples by county.
dataFiles <- "ipums_sample_census_year%d_menwomen_notinlaborforce_inlaborforce__x-is-men_t-is-LABFORCE.csv"
yearLabels <- c(1860, 1870, 1880, 1900, 1910, 1920, 1930, 1940, 1950, 1960, 1980, 1990, 2000)
datasetLabel <- "IPUMS census (by county) data: Work Force Status by GENDER: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# Census samples by MCD.
dataFiles <- "ipums_sample_census_by_mcd_year%d_menwomen_notinlaborforce_inlaborforce__x-is-men_t-is-LABFORCE.csv"
yearLabels <- c(1860, 1870, 1880, 1900, 1910, 1920, 1930)
datasetLabel <- "IPUMS census (by MCD) data: Work Force Status by GENDER: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# ACS samples by county.
dataFiles <- "ipums_acs_census_year%d_menwomen_notinlaborforce_inlaborforce__x-is-men_t-is-LABFORCE.csv"
yearLabels <- c(2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016)
datasetLabel <- "IPUMS ACS (by county) data: Work Force Status by GENDER: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# Full census by county.
dataFiles <- "ipums_full_census_year%d_menwomen_notinlaborforce_inlaborforce__x-is-men_t-is-LABFORCE.csv"
# not using 1850, because no women
yearLabels <- c(1880, 1910, 1920, 1930, 1940)
datasetLabel <- "IPUMS full census (by county) data: Work Force Status by GENDER: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# Full census by MCD (1880 only).
dataFiles <- "ipums_full_census_by_mcd_year%d_menwomen_notinlaborforce_inlaborforce__x-is-men_t-is-LABFORCE.csv"
yearLabels <- 1880
datasetLabel <- "IPUMS full census (by MCD) data: Work Force Status by GENDER: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)
###

##################      literacy by foreign born status

# Literacy by foreign-born status (files named x-is-foreignborn, t-is-LIT).
# Processed only when the script runs with split argument 1.
datasetInTrainingGLOBAL <- 1
dataDir <- kINPUTDATADIR
blabel <- "Foreignborn"
wlabel <- "Notforeignborn"

# Census samples by county.
yearLabels <- c(1870, 1880, 1900, 1910, 1920, 1930)
dataFiles <- "ipums_sample_census_year%d_foreignborn_nativeborn_literate_notliterate_x-is-foreignborn_t-is-LIT.csv"
datasetLabel <- "IPUMS census (by county) data: Literacy by NATIVITY: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# Census samples by MCD (same years as above).
dataFiles <- "ipums_sample_census_by_mcd_year%d_foreignborn_nativeborn_literate_notliterate_x-is-foreignborn_t-is-LIT.csv"
datasetLabel <- "IPUMS census (by MCD) data: Literacy by NATIVITY: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# Full census by county.
yearLabels <- c(1900, 1910, 1920, 1930)
dataFiles <- "ipums_full_census_year%d_foreignborn_nativeborn_literate_notliterate_x-is-foreignborn_t-is-LIT.csv"
datasetLabel <- "IPUMS full census (by county) data: Literacy by NATIVITY: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)



##################      English speaking ability by foreign born status

# English-speaking ability by foreign-born status (files named
# x-is-foreignborn, t-is-SPEAKENG). Processed only when the script runs with
# split argument 0. All three sources cover the same years.
datasetInTrainingGLOBAL <- 0
dataDir <- kINPUTDATADIR
yearLabels <- c(1900, 1910, 1920, 1930)
blabel <- "Foreignborn"
wlabel <- "Notforeignborn"

# Census samples by county.
dataFiles <- "ipums_sample_census_year%d_foreignborn_nativeborn_speaksenglish_doesnotspeakenglish_x-is-foreignborn_t-is-SPEAKENG.csv"
datasetLabel <- "IPUMS census (by county) data: SPEAKENG by NATIVITY: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# Census samples by MCD.
dataFiles <- "ipums_sample_census_by_mcd_year%d_foreignborn_nativeborn_speaksenglish_doesnotspeakenglish_x-is-foreignborn_t-is-SPEAKENG.csv"
datasetLabel <- "IPUMS census (by MCD) data: SPEAKENG by NATIVITY: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# Full census by county.
dataFiles <- "ipums_full_census_year%d_foreignborn_nativeborn_speaksenglish_doesnotspeakenglish_x-is-foreignborn_t-is-SPEAKENG.csv"
datasetLabel <- "IPUMS full census (by county) data: SPEAKENG by NATIVITY: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)


##################      NATIVITY by RACE

# Foreign-born status by race (files named x-is-black, t-is-NATIVITY).
# Processed only when the script runs with split argument 1.
datasetInTrainingGLOBAL <- 1
dataDir <- kINPUTDATADIR
blabel <- "Blacks"
wlabel <- "Whites"

# Census samples by county.
dataFiles <- "ipums_sample_census_year%d_blackwhite_foreignborn_nativeborn__x-is-black_t-is-NATIVITY.csv"
yearLabels <- c(1870, 1880, 1900, 1910, 1920, 1930, 1960)
datasetLabel <- "IPUMS census (by county) data: Foreign born status by RACE: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# Census samples by MCD.
dataFiles <- "ipums_sample_census_by_mcd_year%d_blackwhite_foreignborn_nativeborn__x-is-black_t-is-NATIVITY.csv"
yearLabels <- c(1870, 1880, 1900, 1910, 1920, 1930)
datasetLabel <- "IPUMS census (by MCD) data: Foreign born status by RACE: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# Full census by county.
dataFiles <- "ipums_full_census_year%d_blackwhite_foreignborn_nativeborn__x-is-black_t-is-NATIVITY.csv"
yearLabels <- c(1880, 1900, 1910, 1920, 1930, 1940)
datasetLabel <- "IPUMS full census (by county) data: Foreign born status by RACE: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# Full census by MCD (1880 only).
dataFiles <- "ipums_full_census_by_mcd_year%d_blackwhite_foreignborn_nativeborn__x-is-black_t-is-NATIVITY.csv"
yearLabels <- 1880
datasetLabel <- "IPUMS full census (by MCD) data: Foreign born status by RACE: %d: %s"
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)
###

##################      NATIVITY by GENDER

datasetInTrainingGLOBAL <- 0

# Shared by every group in this section.
dataDir <- kINPUTDATADIR
blabel <- "Men"
wlabel <- "Women"

# sample census, by county
datasetLabel <- "IPUMS census (by county) data: Foreign born status by GENDER: %d: %s"
dataFiles <- "ipums_sample_census_year%d_menwomen_foreignborn_nativeborn__x-is-men_t-is-NATIVITY.csv"
yearLabels <- c(1870, 1880, 1900, 1910, 1920, 1930, 1960)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# sample census, by MCD
# note typo in filename leaves off menwomen_:
datasetLabel <- "IPUMS census (by MCD) data: Foreign born status by GENDER: %d: %s"
dataFiles <- "ipums_sample_census_by_mcd_year%d_foreignborn_nativeborn__x-is-men_t-is-NATIVITY.csv"
yearLabels <- c(1870, 1880, 1900, 1910, 1920, 1930)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# full census, by county
datasetLabel <- "IPUMS full census (by county) data: Foreign born status by GENDER: %d: %s"
dataFiles <- "ipums_full_census_year%d_menwomen_foreignborn_nativeborn__x-is-men_t-is-NATIVITY.csv"
yearLabels <- c(1880, 1900, 1910, 1920, 1930, 1940)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# full census, by MCD (1880 only)
datasetLabel <- "IPUMS full census (by MCD) data: Foreign born status by GENDER: %d: %s"
dataFiles <- "ipums_full_census_by_mcd_year%d_menwomen_foreignborn_nativeborn__x-is-men_t-is-NATIVITY.csv"
yearLabels <- 1880
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)


##################      CLASSWKR by GENDER

datasetInTrainingGLOBAL <- 1

# Shared by every group in this section.
dataDir <- kINPUTDATADIR
blabel <- "Men"
wlabel <- "Women"

# sample census, by county
datasetLabel <- "IPUMS census (by county) data: CLASSWKR by GENDER: %d: %s"
dataFiles <- "ipums_sample_census_year%d_menwomen_selfemployed_worksforwages__x-is-men_t-is-CLASSWKR.csv"
yearLabels <- c(1910, 1920, 1930, 1940, 1950, 1960, 1980, 1990, 2000)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# sample census, by MCD
datasetLabel <- "IPUMS census (by MCD) data: CLASSWKR by GENDER: %d: %s"
dataFiles <- "ipums_sample_census_by_mcd_year%d_menwomen_selfemployed_worksforwages__x-is-men_t-is-CLASSWKR.csv"
yearLabels <- c(1910, 1920, 1930)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# ACS, by county
datasetLabel <- "IPUMS ACS (by county) data: CLASSWKR by GENDER: %d: %s"
dataFiles <- "ipums_acs_census_year%d_menwomen_selfemployed_worksforwages__x-is-men_t-is-CLASSWKR.csv"
yearLabels <- c(2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# full census, by county
datasetLabel <- "IPUMS full census (by county) data: CLASSWKR by GENDER: %d: %s"
dataFiles <- "ipums_full_census_year%d_menwomen_selfemployed_worksforwages__x-is-men_t-is-CLASSWKR.csv"
yearLabels <- c(1910, 1920, 1930, 1940)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

##################      CLASSWKR by RACE

datasetInTrainingGLOBAL <- 0

# Shared by every group in this section.
dataDir <- kINPUTDATADIR
blabel <- "Blacks"
wlabel <- "Whites"

# sample census, by county
datasetLabel <- "IPUMS census (by county) data: CLASSWKR by RACE: %d: %s"
dataFiles <- "ipums_sample_census_year%d_blackwhite_selfemployed_worksforwages__x-is-black_t-is-CLASSWKR.csv"
yearLabels <- c(1910, 1920, 1930, 1950, 1960, 1980, 1990, 2000)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# sample census, by MCD
datasetLabel <- "IPUMS census (by MCD) data: CLASSWKR by RACE: %d: %s"
dataFiles <- "ipums_sample_census_by_mcd_year%d_blackwhite_selfemployed_worksforwages__x-is-black_t-is-CLASSWKR.csv"
yearLabels <- c(1910, 1920, 1930)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# ACS, by county
datasetLabel <- "IPUMS ACS (by county) data: CLASSWKR by RACE: %d: %s"
dataFiles <- "ipums_acs_census_year%d_blackwhite_selfemployed_worksforwages__x-is-black_t-is-CLASSWKR.csv"
yearLabels <- c(2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# full census, by county
datasetLabel <- "IPUMS full census (by county) data: CLASSWKR by RACE: %d: %s"
dataFiles <- "ipums_full_census_year%d_blackwhite_selfemployed_worksforwages__x-is-black_t-is-CLASSWKR.csv"
yearLabels <- c(1910, 1920, 1930, 1940)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

##################      Vet status by race

datasetInTrainingGLOBAL <- 1

# Shared by every group in this section.
dataDir <- kINPUTDATADIR
blabel <- "Blacks"
wlabel <- "Whites"

# sample census, by county
datasetLabel <- "IPUMS census (by county) data: Veteran Status by RACE: %d: %s"
dataFiles <- "ipums_sample_census_year%d_blackwhite_vet_notvet__x-is-black_t-is-vetstat.csv"
yearLabels <- c(1950, 1960, 1980, 1990, 2000)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# ACS, by county
# 2016 was not saved, so the year list stops at 2015 here.
datasetLabel <- "IPUMS ACS (by county) data: Veteran Status by RACE: %d: %s"
dataFiles <- "ipums_acs_census_year%d_blackwhite_vet_notvet__x-is-black_t-is-vetstat.csv"
yearLabels <- c(2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# full census, by county
datasetLabel <- "IPUMS full census (by county) data: Veteran Status by RACE: %d: %s"
dataFiles <- "ipums_full_census_year%d_blackwhite_vet_notvet__x-is-black_t-is-vetstat.csv"
yearLabels <- 1940
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

##################      EMPSTAT

datasetInTrainingGLOBAL <- 0

# Shared by every group in this section.
dataDir <- kINPUTDATADIR
blabel <- "Blacks"
wlabel <- "Whites"

# sample census, by county
datasetLabel <- "IPUMS census (by county) data: Unemployment Status by RACE: %d: %s"
dataFiles <- "ipums_sample_census_year%d_blackwhite_unemployed_employed__x-is-black_t-is-EMPSTAT.csv"
yearLabels <- c(1930, 1950, 1960, 1980, 1990, 2000)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# ACS, by county
datasetLabel <- "IPUMS ACS (by county) data: Unemployment Status by RACE: %d: %s"
dataFiles <- "ipums_acs_census_year%d_blackwhite_unemployed_employed__x-is-black_t-is-EMPSTAT.csv"
yearLabels <- c(2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# full census, by county
datasetLabel <- "IPUMS full census (by county) data: Unemployment Status by RACE: %d: %s"
dataFiles <- "ipums_full_census_year%d_blackwhite_unemployed_employed__x-is-black_t-is-EMPSTAT.csv"
yearLabels <- c(1930, 1940)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)



##################      EMPSTAT GENDER

datasetInTrainingGLOBAL <- 1

# Shared by every group in this section.
dataDir <- kINPUTDATADIR
blabel <- "Men"
wlabel <- "Women"

# sample census, by county
datasetLabel <- "IPUMS census (by county) data: Unemployment Status by GENDER: %d: %s"
dataFiles <- "ipums_sample_census_year%d_menwomen_unemployed_employed__x-is-men_t-is-EMPSTAT.csv"
yearLabels <- c(1930, 1950, 1960, 1980, 1990, 2000)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# ACS, by county
datasetLabel <- "IPUMS ACS (by county) data: Unemployment Status by GENDER: %d: %s"
dataFiles <- "ipums_acs_census_year%d_menwomen_unemployed_employed__x-is-men_t-is-EMPSTAT.csv"
yearLabels <- c(2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# full census, by county
datasetLabel <- "IPUMS full census (by county) data: Unemployment Status by GENDER: %d: %s"
dataFiles <- "ipums_full_census_year%d_menwomen_unemployed_employed__x-is-men_t-is-EMPSTAT.csv"
yearLabels <- c(1930, 1940)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)


##################      English speaking ability by citizenship status of foreign born

datasetInTrainingGLOBAL <- 1

# Shared by both groups in this section.
dataDir <- kINPUTDATADIR
blabel <- "ForeignbornNonCitizen"
wlabel <- "ForeignbornCitizen"

# sample census, by county
datasetLabel <- "IPUMS census (by county) data: Does not speak English very well SPEAKENG by Foreign born non-citizen vs foreign born citizen CITIZEN: %d: %s"
dataFiles <- "ipums_sample_census_year%d_foreignbornnoncitizen_foreignborncitizen_doesnotspeakenglishverywell_speaksverywell__x-is-foreignbornnoncitizen_t-is-SPEAKENG.csv"
yearLabels <- c(1980, 1990, 2000)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# ACS, by county
datasetLabel <- "IPUMS ACS (by county) data: Does not speak English very well SPEAKENG by Foreign born non-citizen vs foreign born citizen CITIZEN: %d: %s"
dataFiles <- "ipums_acs_census_year%d_foreignbornnoncitizen_foreignborncitizen_doesnotspeakenglishverywell_speaksverywell__x-is-foreignbornnoncitizen_t-is-SPEAKENG.csv"
yearLabels <- c(2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)



##################      English speaking ability by GENDER

datasetInTrainingGLOBAL <- 0

# Shared by both groups in this section.
dataDir <- kINPUTDATADIR
blabel <- "Men"
wlabel <- "Women"

# sample census, by county
datasetLabel <- "IPUMS census (by county) data: SPEAKENG by GENDER: %d: %s"
dataFiles <- "ipums_sample_census_year%d_menwomen_speaksenglishverywell_doesnotspeakenglishverywell_x-is-men_t-is-SPEAKENG.csv"
yearLabels <- c(1980, 1990, 2000)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# ACS, by county
datasetLabel <- "IPUMS ACS (by county) data: SPEAKENG by GENDER: %d: %s"
dataFiles <- "ipums_acs_census_year%d_menwomen_speaksenglishverywell_doesnotspeakenglishverywell_x-is-men_t-is-SPEAKENG.csv"
yearLabels <- c(2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

##################      English speaking ability by RACE

datasetInTrainingGLOBAL <- 0

# Shared by both groups in this section.
dataDir <- kINPUTDATADIR
blabel <- "Blacks"
wlabel <- "Whites"

# sample census, by county
datasetLabel <- "IPUMS census (by county) data: SPEAKENG by RACE: %d: %s"
dataFiles <- "ipums_sample_census_year%d_blackwhite_speaksenglishverywell_doesnotspeakenglishverywell_x-is-black_t-is-SPEAKENG.csv"
yearLabels <- c(1980, 1990, 2000)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# ACS, by county
datasetLabel <- "IPUMS ACS (by county) data: SPEAKENG by RACE: %d: %s"
dataFiles <- "ipums_acs_census_year%d_blackwhite_speaksenglishverywell_doesnotspeakenglishverywell_x-is-black_t-is-SPEAKENG.csv"
yearLabels <- c(2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)


##################      HCOVANY GENDER

datasetInTrainingGLOBAL <- 1

# ACS, by county (the only group in this section)
blabel <- "Men"
wlabel <- "Women"
dataDir <- kINPUTDATADIR
datasetLabel <- "IPUMS ACS (by county) data: Health Insurance Coverage by GENDER: %d: %s"
dataFiles <- "ipums_acs_census_year%d_menwomen_nohealthinsurance_healthinsurance__x-is-men_t-is-HCOVANY.csv"
yearLabels <- c(2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)
###


##################      HCOVANY RACE

datasetInTrainingGLOBAL <- 1

# ACS, by county (the only group in this section)
blabel <- "Blacks"
wlabel <- "Whites"
dataDir <- kINPUTDATADIR
datasetLabel <- "IPUMS ACS (by county) data: Health Insurance Coverage by RACE: %d: %s"
dataFiles <- "ipums_acs_census_year%d_blackwhite_nohealthinsurance_healthinsurance__x-is-black_t-is-HCOVANY.csv"
yearLabels <- c(2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)
###

##################      Citizenship status of foreign born by RACE

datasetInTrainingGLOBAL <- 0

# Shared by both groups in this section.
dataDir <- kINPUTDATADIR
blabel <- "Blacks"
wlabel <- "Whites"

# sample census, by county
datasetLabel <- "IPUMS census (by county) data: Citizenship status CITIZEN of foreign born by RACE: %d: %s"
dataFiles <- "ipums_sample_census_year%d_blackwhite_foreignborncitizen_foreignbornnoncitizen_x-is-black_t-is-CITIZEN.csv"
yearLabels <- c(1980, 1990, 2000)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# ACS, by county
datasetLabel <- "IPUMS ACS (by county) data: Citizenship status CITIZEN of foreign born by RACE: %d: %s"
dataFiles <- "ipums_acs_census_year%d_blackwhite_foreignborncitizen_foreignbornnoncitizen_x-is-black_t-is-CITIZEN.csv"
yearLabels <- c(2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)



##################      Citizenship status of foreign born by GENDER

datasetInTrainingGLOBAL <- 1

# Shared by both groups in this section.
dataDir <- kINPUTDATADIR
blabel <- "Men"
wlabel <- "Women"

# sample census, by county
datasetLabel <- "IPUMS census (by county) data: Citizenship status CITIZEN of foreign born by GENDER: %d: %s"
dataFiles <- "ipums_sample_census_year%d_menwomen_foreignborncitizen_foreignbornnoncitizen_x-is-men_t-is-CITIZEN.csv"
yearLabels <- c(1980, 1990, 2000)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# ACS, by county
datasetLabel <- "IPUMS ACS (by county) data: Citizenship status CITIZEN of foreign born by GENDER: %d: %s"
dataFiles <- "ipums_acs_census_year%d_menwomen_foreignborncitizen_foreignbornnoncitizen_x-is-men_t-is-CITIZEN.csv"
yearLabels <- c(2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)





##################      POVERTY by GENDER

datasetInTrainingGLOBAL <- 1

# Shared by both groups in this section.
dataDir <- kINPUTDATADIR
blabel <- "Men"
wlabel <- "Women"

# NOTE(review): both file names below say "x-is-black" although the groups are
# men/women; presumably this mirrors the names on disk -- confirm before renaming.

# sample census, by county
datasetLabel <- "IPUMS census (by county) data: POVERTY by GENDER: %d: %s"
dataFiles <- "ipums_sample_census_year%d_menwomen_belowpovertythreshold_povertythreshold100orhigher__x-is-black_t-is-POVERTY.csv"
yearLabels <- c(1950, 1960, 1980, 1990)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# ACS, by county
datasetLabel <- "IPUMS ACS (by county) data: POVERTY by GENDER: %d: %s"
dataFiles <- "ipums_acs_census_year%d_menwomen_belowpovertythreshold_povertythreshold100orhigher__x-is-black_t-is-POVERTY.csv"
yearLabels <- c(2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)


##################      POVERTY by RACE

datasetInTrainingGLOBAL <- 1

# Shared by both groups in this section.
dataDir <- kINPUTDATADIR
blabel <- "Blacks"
wlabel <- "Whites"

# sample census, by county
datasetLabel <- "IPUMS census (by county) data: POVERTY by RACE: %d: %s"
dataFiles <- "ipums_sample_census_year%d_blackwhite_belowpovertythreshold_povertythreshold100orhigher__x-is-black_t-is-POVERTY.csv"
yearLabels <- c(1950, 1960, 1980, 1990)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# ACS, by county
datasetLabel <- "IPUMS ACS (by county) data: POVERTY by RACE: %d: %s"
dataFiles <- "ipums_acs_census_year%d_blackwhite_belowpovertythreshold_povertythreshold100orhigher__x-is-black_t-is-POVERTY.csv"
yearLabels <- c(2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)


##################      Poverty status by citizenship status of foreign born

datasetInTrainingGLOBAL <- 1

# Shared by both groups in this section.
dataDir <- kINPUTDATADIR
blabel <- "ForeignbornNonCitizen"
wlabel <- "ForeignbornCitizen"

# sample census, by county
datasetLabel <- "IPUMS census (by county) data: Below poverty threshold POVERTY by Foreign born non-citizen vs foreign born citizen CITIZEN: %d: %s"
dataFiles <- "ipums_sample_census_year%d_foreignbornnoncitizen_foreignborncitizen_belowpovertythreshold_povertythreshold100orhigher__x-is-foreignbornnoncitizen_t-is-POVERTY.csv"
yearLabels <- c(1980, 1990)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)

# ACS, by county
datasetLabel <- "IPUMS ACS (by county) data: Below poverty threshold POVERTY by Foreign born non-citizen vs foreign born citizen CITIZEN: %d: %s"
dataFiles <- "ipums_acs_census_year%d_foreignbornnoncitizen_foreignborncitizen_belowpovertythreshold_povertythreshold100orhigher__x-is-foreignbornnoncitizen_t-is-POVERTY.csv"
yearLabels <- c(2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016)
updateWithIPUMSData(dataDir, dataFiles, yearLabels, datasetLabel, blabel, wlabel)




##############################################################################
# Add unique ids, drop the unused rows, and save.
#
# Reads the global `datasetSummaryDataframe$summary`: element 1 is the header
# slot and the following elements hold one comma-separated result row each.
# Rows from the first "0" entry onward are treated as unused and dropped.
# Produces `datasetSummaryDataframe_final` (header line plus id-prefixed data
# rows) and writes it to `outputSaveFilepath`.
#
# The row ranges are built with match()/seq_len() rather than `2:n`, because
# `2:n` counts backwards (2, 1) when n < 2; with no completed data rows that
# used to trip the stopifnot() below the header instead of writing a
# header-only file.

# index 1 is the header
allSummaryRows <- datasetSummaryDataframe$summary
candidateRows <- allSummaryRows[-1]

# Stop at the first "0" entry; if there is none, every row is complete.
firstUnusedRow <- match("0", candidateRows)
if (is.na(firstUnusedRow)) {
  numberDataRows <- length(candidateRows)
} else {
  numberDataRows <- firstUnusedRow - 1
}
completedRows <- candidateRows[seq_len(numberDataRows)]

# A missing entry among the completed rows is still a hard error.
stopifnot(!anyNA(completedRows))

# Header row plus the completed data rows.
numberCompleteRows <- numberDataRows + 1

summaryHeaderLine <- "unique_id, bd, hbdl0, hbdu0, cil, cir, bdl, bdu, t3, ss.y, ss.reslo, ss.res2, p, w0, c1, d1, l, u, w30, w31, w32, w33, ss.res.loessWithTrueBetaB, ss.res.regBetaBVsX, regBetaBVsXvcov.tstat, regBetaBVsX.lmsummary.tstat, ss.res.loessWithTrueBetaW, ss.res.regBetaWVsX, regBetaWVsXvcov.tstat, regBetaWVsX.lmsummary.tstat, tb2, tw2, tb1, tw1, corTruebetaBandTruebetaW, dataset_label"

# Prefix each data row with its 1-based unique id (vectorized; yields
# character(0) when there are no data rows).
idPrefixedRows <- sprintf("%d, %s", seq_len(numberDataRows), completedRows)

datasetSummaryDataframe_final <- data.frame(
  summary = c(summaryHeaderLine, idPrefixedRows),
  stringsAsFactors = FALSE
)

print(sprintf("Total number of rows: %d", numberCompleteRows))

##############################################################################
# save data
# quote=FALSE keeps each pre-joined line as-is so the file reads as plain CSV.
write.csv(datasetSummaryDataframe_final, outputSaveFilepath, quote=FALSE, row.names = FALSE)
##############################################################################



